TL;DR

Using a common dispersion parameter or a gene-wise parameter seems to have little effect on the estimation.

When the data are simulated from the model, it seems that even if only \(\mu\) depends on \(W\), the fact that we include it also in \(\pi\) is not problematic.

More details

Fitting the zinb model to the Fluidigm data, I’ve realized that the estimation of \(W\) is not good. I suspect that this has to do with the fact that \(W\) is used both in the regression of \(\mu\) and of \(\pi\), while it is common, for instance in the ZIFA model, to include \(W\) only in the regression of \(\mu\).

Here, I simulate data with \(W\) only in \(\mu\) and see if its estimation is problematic even when the model is correct (meaning data are simulated from a zinb model). But I will first check that we can estimate \(W\) when the model is correct in placing \(W\) both in \(\mu\) and \(\pi\).

Both \(\mu\) and \(\pi\) depend on \(W\)

To simulate realistic data, I will start from the Fluidigm data to estimate the zinb parameters and use those as the starting point from my simulations.

data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10
fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
bio <- as.factor(colData(fluidigm_high)$Biological_Condition)

I am using PCA to estimate the two factors that we want to simulate. Note that I’m simulating different dispersion parameters for each gene, so we can also test the robustness to the common dispersion parameter.

set.seed(2244)
pca <- prcomp(t(log1p(high)), center=TRUE, scale=TRUE)
true_W <- pca$x[,1:2]
mod <- model.matrix(~true_W)

zinb_high <- zinbFit(high, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3, commondispersion=FALSE)

true_alpha_mu <- zinb_high@beta_mu[-1,]
true_alpha_pi <- zinb_high@beta_pi[-1,]
true_beta_mu <- zinb_high@beta_mu[1,,drop=FALSE]
true_beta_pi <- zinb_high@beta_pi[1,,drop=FALSE]

sim_obj <- zinbModel(W=true_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi, beta_mu=true_beta_mu, beta_pi=true_beta_pi, zeta = zinb_high@zeta)

sim1 <- zinbSim(sim_obj, seed=3435)
counts1 <- t(sim1$counts)
boxplot(log1p(counts1), main="Simulated counts")

boxplot(log1p(high), main="Fluidigm observed counts")

filter <- apply(counts1>10, 1, sum)>=10
filtered1 <- counts1[filter,]
est1 <- zinbFit(filtered1, ncores=3, K=2, commondispersion=FALSE)
est2 <- zinbFit(filtered1, ncores=3, K=2, commondispersion=TRUE)
plot(true_W, col=cols[bio], main="True W", pch=19)

plot(est1@W, col=cols[bio], main="Estimated W (commondispersion=FALSE)", pch=19)

plot(est1@W[,1]~true_W[,1], xlab="True W", ylab="Estimated W", main="Factor 1")

plot(est1@W[,2]~true_W[,2], xlab="True W", ylab="Estimated W", main="Factor 2")

plot(est2@W, col=cols[bio], main="Estimated W (commondispersion=TRUE)", pch=19)

plot(est2@W[,1]~true_W[,1], xlab="True W", ylab="Estimated W", main="Factor 1")

plot(est2@W[,2]~true_W[,2], xlab="True W", ylab="Estimated W", main="Factor 2")

plot(est2@W[,1]~est1@W[,1], xlab="commondispersion=FALSE", ylab="commondispersion=TRUE", main="Factor 1")

plot(est2@W[,2]~est1@W[,2], xlab="commondispersion=FALSE", ylab="commondispersion=TRUE", main="Factor 2")

Only \(\mu\) depends on \(W\)

Using the same parameters as before but setting \(\alpha_pi=0\).

zinb_high <- zinbFit(high, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1L, commondispersion=FALSE)

true_alpha_mu <- zinb_high@beta_mu[-1,]
true_alpha_pi <- zinb_high@beta_pi[-1,]
true_beta_mu <- zinb_high@beta_mu[1,,drop=FALSE]
true_beta_pi <- zinb_high@beta_pi[1,,drop=FALSE]

sim_obj <- zinbModel(W=true_W, alpha_mu=true_alpha_mu, alpha_pi=matrix(data=0, ncol=NROW(high), nrow=NCOL(true_W)), beta_mu=true_beta_mu, beta_pi=true_beta_pi, zeta = zinb_high@zeta)

sim2 <- zinbSim(sim_obj, seed=3225)
counts2 <- t(sim2$counts)
boxplot(log1p(counts2), main="Simulated counts", ylim=c(0, 12))

boxplot(log1p(high), main="Fluidigm observed counts")

filter <- apply(counts2>10, 1, sum)>=10
filtered2 <- counts2[filter,]
est3 <- zinbFit(filtered2, ncores=3, K=2, commondispersion=FALSE)
est4 <- zinbFit(filtered2, ncores=3, K=2, commondispersion=TRUE)
plot(true_W, col=cols[bio], main="True W", pch=19)

plot(est3@W, col=cols[bio], main="Estimated W (commondispersion=FALSE)", pch=19)

plot(est3@W[,1]~true_W[,1], xlab="True W", ylab="Estimated W", main="Factor 1")

plot(est3@W[,2]~true_W[,2], xlab="True W", ylab="Estimated W", main="Factor 2")

plot(est4@W, col=cols[bio], main="Estimated W (commondispersion=TRUE)", pch=19)

plot(est4@W[,1]~true_W[,1], xlab="True W", ylab="Estimated W", main="Factor 1")

plot(est4@W[,2]~true_W[,2], xlab="True W", ylab="Estimated W", main="Factor 2")

plot(est4@W[,1]~est3@W[,1], xlab="commondispersion=FALSE", ylab="commondispersion=TRUE", main="Factor 1")

plot(est4@W[,2]~est3@W[,2], xlab="commondispersion=FALSE", ylab="commondispersion=TRUE", main="Factor 2")